Curated data

Checking availability of months of data for states in SDC, looking at the curated data S3 bucket instead of Redshift.

Building off of AWS CLI commands such as:

aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/
# for one state:

aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20171031/content/state=TX/table=alert/projection=redshift/year=2017/

Looking first at Texas, see that only one month available (December) for TX in older version of curated data:

cmd = "aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20171031/content/state=TX/table=alert/projection=redshift/year=2017/"
print(system(cmd, intern = T))
## [1] "                           PRE month=12/"

And only three months available in more recent version.

cmd = "aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20180331/content/state=TX/table=alert/projection=redshift/year=2017/"
print(system(cmd, intern = T))
## [1] "                           PRE month=10/"
## [2] "                           PRE month=11/"
## [3] "                           PRE month=12/"

Same, but now compare with the recently-completed new ingestion/curation workflow, using the dev bucket. Only 2018 data is available, may and June. No 2017 yet available (2018-06-07).

cmd = "aws s3 ls s3://dev-dot-sdc-curated-911061262852-us-east-1/waze/version=20180531/content/state=NY/table=alert/projection=redshift/year=2018/"
print(system(cmd, intern = T))
## [1] "                           PRE month=01/"
## [2] "                           PRE month=03/"
## [3] "                           PRE month=05/"
## [4] "                           PRE month=06/"
## [5] "                           PRE month=07/"
cmd = "aws s3 ls s3://dev-dot-sdc-curated-911061262852-us-east-1/waze/version=20180531/content/state=TX/table=alert/projection=redshift/year=2018/"
print(system(cmd, intern = T))
## [1] "                           PRE month=03/"
## [2] "                           PRE month=05/"
## [3] "                           PRE month=06/"
## [4] "                           PRE month=07/"

2018 Dev

# Loop over states to find which ones have complete data in 2018:
avail.mo = vector()
use.states = c(state.abb, c("DC", "CA1", "CA2", "CA3"))
find.months = formatC(1:12, width = 2, flag = "0")
for(i in use.states){
  cmd = paste0('aws s3 ls s3://dev-dot-sdc-curated-911061262852-us-east-1/waze/version=20180531/content/state=', i ,'/table=alert/projection=redshift/year=2018/')
  mo_i <- system(cmd, intern = T)
  avail.mo_i <- substr(mo_i, start =nchar(mo_i[1])-2, stop = nchar(mo_i[1])-1)
  
  avail.mo = rbind(avail.mo, c(i, find.months %in% avail.mo_i))
}

avail.mo <- as.data.frame(avail.mo)
colnames(avail.mo) = c("State", find.months)
DT::datatable(avail.mo) %>% formatStyle(2:13, background = styleEqual('TRUE', 'lightgreen'))

2018 New Prod

# Loop over states to find which ones have complete data in 2018, using new production version
avail.mo = vector()
use.states = sort(c(state.abb, c("DC", "CA1", "CA2", "CA3")))
find.months = formatC(1:12, width = 2, flag = "0")
for(i in use.states){
  cmd = paste0('aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20180720/content/state=', i ,'/table=alert/projection=redshift/year=2018/')
  mo_i <- system(cmd, intern = T)
  avail.mo_i <- substr(mo_i, start =nchar(mo_i[1])-2, stop = nchar(mo_i[1])-1)
  
  avail.mo = rbind(avail.mo, c(i, find.months %in% avail.mo_i))
}

avail.mo <- as.data.frame(avail.mo)
colnames(avail.mo) = c("State", find.months)
DT::datatable(avail.mo) %>% formatStyle(2:13, background = styleEqual('TRUE', 'lightgreen'))

Tracking progress of curation for by day. Look to see how many days within each month have at least one record

# Loop over states and months, to find which ones have complete days of data in 2018, using new production version.
# Also track object number and size of month folder. 
# Could make faster by making state/month commands pasted together and doing just one loop
avail.mo = size.mo = obj.mo = vector()

find.months = formatC(1:12, width = 2, flag = "0")
use.states = sort(c(state.abb, c("DC", "CA1", "CA2", "CA3")))

for(state in use.states){
  cat(state, "\n")
  for(i in find.months){ # i = "01"; state = "DC"
    cmd = paste0('aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20180720/content/state=', state ,'/table=alert/projection=redshift/year=2018/month=', i, "/")
    mo_i <- system(cmd, intern = T)
    avail.day_i <- substr(mo_i, start = nchar(mo_i[1])-2, stop = nchar(mo_i[1])-1)
    
    
    size_cmd = paste0('aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20180720/content/state=', state ,'/table=alert/projection=redshift/year=2018/month=', i, "/ --recursive --summarize --human-readable ")
    size_i <- system(size_cmd, intern = T)
    
    avail.mo = c(avail.mo, max(as.numeric(avail.day_i)))
    size.mo = c(size.mo, gsub("^(\\s*)+Total Size: ", "", size_i[length(size_i)]))
    obj.mo = c(obj.mo, gsub("^(\\s*)+Total Objects: ", "", size_i[length(size_i)-1]))

  }
}
## AK 
## AL 
## AR 
## AZ 
## CA 
## CA1 
## CA2 
## CA3 
## CO 
## CT 
## DC 
## DE 
## FL 
## GA 
## HI 
## IA 
## ID 
## IL 
## IN 
## KS 
## KY 
## LA 
## MA 
## MD 
## ME 
## MI 
## MN 
## MO 
## MS 
## MT 
## NC 
## ND 
## NE 
## NH 
## NJ 
## NM 
## NV 
## NY 
## OH 
## OK 
## OR 
## PA 
## RI 
## SC 
## SD 
## TN 
## TX 
## UT 
## VA 
## VT 
## WA 
## WI 
## WV 
## WY
avail.mo[avail.mo==-Inf] = NA

df <- data.frame(State = rep(use.states, each = length(find.months)),
                 Month = rep(find.months, times = length(use.states)),
                 Days = as.numeric(avail.mo),
                 Size = size.mo,
                 Objects = as.numeric(obj.mo))

df$Value = paste(df$Days, df$Size, df$Objects, sep = " \t")
df$Value[is.na(df$Days)] = NA

df2 <- df %>% 
  select(State, Month, Value) %>%
  spread(key = Month, value = Value)

DT::datatable(df2)# %>% formatStyle(2:13, color = styleEqual('<NA>', 'grey20'))

Visual representation

library(plotly)
#ggplot(df, aes(x = Month, y = Objects, fill = State)) +geom_dotplot(binaxis = "y", stackdir='center', position = 'dodge')

df$usetext <- with(df, paste(State, "\n", Month, "\n",  format(Objects, big.mark = ",", digits = 0, scientific = F), "Objects", "\n", Size))

gp.focal <- ggplot(df %>% filter(State %in% c("AK", "AL", "CT", "DC", "FL", "IL", "IN", "MA", "MD", "NY", "PA", "TN", "TX", "UT", "WY")),
             aes(x = Month, 
                 y = Objects,
                 group = State,
                 text = usetext)) + theme_bw()

gp.focal2 <- gp.focal + geom_line(aes(color = State), 
                                  alpha = 0.8,
                                  size = 2) +
  theme(axis.text.x = element_text(size=7, angle=45),
        axis.text.y = element_text(size=7, angle=45)) +
  ylab("Count of Objects in S3") +
  xlab(" ")  + 
  guides(color=guide_legend(title="State"))
#gp.focal2

ggplotly(gp.focal2, tooltip = "usetext")

July data… what is being curated now?

i = "07"
for(state in use.states){
  cat(state, "\n")
    cmd = paste0('aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20180720/content/state=', state ,'/table=alert/projection=redshift/year=2018/month=', i, "/")
    mo_i <- system(cmd, intern = T)
    avail.day_i <- substr(mo_i, start = nchar(mo_i[1])-2, stop = nchar(mo_i[1])-1)
    
    
    size_cmd = paste0('aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20180720/content/state=', state ,'/table=alert/projection=redshift/year=2018/month=', i, '/ --recursive --summarize --human-readable')
    (size_i <- system(size_cmd, intern = T))
    
    avail.mo = rbind(avail.mo, c(i, max(as.numeric(avail.day_i))))
    size.mo = rbind(size.mo, gsub("^(\\s*)+Total Size: ", "", size_i[length(size_i)]))
    obj.mo = rbind(obj.mo, gsub("^(\\s*)+Total Objects: ", "", size_i[length(size_i)-1]))

  
}

avail.mo[avail.mo==-Inf] = NA

2017

Now looping across all ‘states’ in the curated data:

# Loop over states to find which ones have month directories:
# can also try previous version, 20171031
avail.mo = vector()
use.states = c(state.abb, c("DC", "CA1", "CA2", "CA3"))
find.months = formatC(1:12, width = 2, flag = "0")
for(i in use.states){
  cmd = paste0('aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20180331/content/state=', i ,'/table=alert/projection=redshift/year=2017/')
  mo_i <- system(cmd, intern = T)
  avail.mo_i <- substr(mo_i, start =nchar(mo_i[1])-2, stop = nchar(mo_i[1])-1)
  
  avail.mo = rbind(avail.mo, c(i, find.months %in% avail.mo_i))
}

avail.mo <- as.data.frame(avail.mo)
colnames(avail.mo) = c("State", find.months)
DT::datatable(avail.mo) %>% formatStyle(2:13, background = styleEqual('TRUE', 'lightgreen'))

Look at the previous version of the database, from 2017-10-31:

# Loop over states to find which ones have month directories:
# can also try previous version, 20171031
avail.mo = vector()
use.states = c(state.abb, c("DC", "CA1", "CA2", "CA3"))
find.months = formatC(1:12, width = 2, flag = "0")
for(i in use.states){
  cmd = paste0('aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20171031/content/state=', i ,'/table=alert/projection=redshift/year=2017/')
  mo_i <- system(cmd, intern = T)
  avail.mo_i <- substr(mo_i, start =nchar(mo_i[1])-2, stop = nchar(mo_i[1])-1)
  
  avail.mo = rbind(avail.mo, c(i, find.months %in% avail.mo_i))
}

avail.mo <- as.data.frame(avail.mo)
colnames(avail.mo) = c("State", find.months)
datatable(avail.mo) %>% formatStyle(2:13, background = styleEqual('TRUE', 'lightgreen'))

2018

# Loop over states to find which ones have complete data in 2018:
avail.mo = vector()
use.states = c(state.abb, c("DC", "CA1", "CA2", "CA3"))
find.months = formatC(1:12, width = 2, flag = "0")
for(i in use.states){
  cmd = paste0('aws s3 ls s3://prod-dot-sdc-curated-911061262852-us-east-1/waze/version=20180331/content/state=', i ,'/table=alert/projection=redshift/year=2018/')
  mo_i <- system(cmd, intern = T)
  avail.mo_i <- substr(mo_i, start =nchar(mo_i[1])-2, stop = nchar(mo_i[1])-1)
  
  avail.mo = rbind(avail.mo, c(i, find.months %in% avail.mo_i))
}

avail.mo <- as.data.frame(avail.mo)
colnames(avail.mo) = c("State", find.months)
DT::datatable(avail.mo) %>% formatStyle(2:13, background = styleEqual('TRUE', 'lightgreen'))